"""
Code to preprocess request data
"""

import pandas as pd
import numpy as np
from scipy.sparse import save_npz
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import re
import string


# Import-time setup: both calls run as soon as this module is imported/executed.
# Load the spaCy pipeline named "es_core_news_md" (the model package must be
# installed). NOTE(review): `nlp` is not referenced anywhere else in the
# visible code - confirm whether it is still needed.
nlp = spacy.load("es_core_news_md")
# Fetch the NLTK 'stopwords' corpus that stopwords.words('spanish') reads below.
nltk.download('stopwords')
# One capture group matching a single punctuation character: everything in
# string.punctuation plus a handful of typographic/Spanish symbols.
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split *s* into tokens, treating each punctuation mark as its own token.

    Every character matched by ``re_tok`` is padded with a space on both
    sides, then the result is split on whitespace.

    Returns a list of strings (empty for an empty or whitespace-only input).
    """
    padded = re_tok.sub(r' \1 ', s)
    return padded.split()


def cleanHtml(sentence):
    """Replace every ``<...>`` span in *sentence* with a single space.

    The input is coerced with ``str()`` first, so non-string values are
    accepted and processed as their string representation. The match is
    non-greedy and ``.`` does not cross newlines, so a tag broken over
    several lines is left untouched.
    """
    text = str(sentence)
    return re.sub('<.*?>', ' ', text)


def cleanPunc(sentence):
    """Strip or blank out punctuation in *sentence*.

    * Question/exclamation marks (including the inverted Spanish forms),
      quotes, ``#`` and ``|`` are deleted outright.
    * Full stops, commas, parentheses, forward slashes and backslashes are
      replaced by a space so the words on either side stay separated.
    * Leading/trailing whitespace is stripped and remaining newlines become
      spaces.

    Returns the cleaned string.
    """
    # Characters are listed directly: inside [...] a "|" is a literal pipe,
    # not an alternation separator. The pipe is kept in the class on purpose
    # so pipes are still removed, as before.
    cleaned = re.sub(r'[¿¡?!\'"#|]', '', sentence)
    # The backslash must be written as "\\" to be matched; the previous
    # pattern had "\|", which only matched a pipe and left backslashes in.
    cleaned = re.sub(r'[.,)(\\/]', ' ', cleaned)
    cleaned = cleaned.strip()
    return cleaned.replace("\n", " ")


# A run of characters that are NOT letters: non-word characters, digits and
# the underscore. In Python 3 str patterns \W is Unicode-aware, so accented
# letters and "ñ" count as letters and are preserved.
_NON_LETTER_RUN = re.compile(r'[\W\d_]+')


def keepAlpha(sentence):
    """Keep only letters in *sentence*; digits and symbols are dropped.

    Each whitespace-separated word has every run of non-letter characters
    replaced by a single space, and the words are re-joined with single
    spaces. Digits are removed, as they were before.

    The previous ASCII-only class ``[^a-z A-Z]`` split Spanish words at
    every accented letter (e.g. "información" became "informaci n"), which
    defeated the later stop-word removal and stemming steps. Pure-ASCII
    input gives exactly the same result as before.

    Returns the cleaned string with outer whitespace stripped.
    """
    cleaned_words = (_NON_LETTER_RUN.sub(' ', word) for word in sentence.split())
    return ' '.join(cleaned_words).strip()

# Spanish stop words from the NLTK corpus.
stop_words = set(stopwords.words('spanish'))
# Case-insensitive match of a whole stop word followed by either one non-word
# character or the end of the string. The previous pattern required the
# trailing non-word character, so a stop word that was the last token of the
# text was never removed.
re_stop_words = re.compile(
    r"\b(" + "|".join(stop_words) + r")(?:\W|$)", re.I)


def removeStopWords(sentence):
    """Replace every Spanish stop word in *sentence* with a single space.

    The non-word character that follows a stop word (if any) is consumed by
    the match, so "de la casa" becomes "  casa". Whitespace is not
    normalized here.

    Returns the string with stop words blanked out.
    """
    return re_stop_words.sub(" ", sentence)

# Shared Snowball stemmer instance for Spanish.
stemmer = SnowballStemmer("spanish")


def stemming(sentence):
    """Stem each whitespace-separated word of *sentence*.

    Words are passed one at a time to the module-level ``stemmer`` and the
    resulting stems are joined with single spaces.

    Returns the stemmed sentence with no leading or trailing whitespace.
    """
    stems = [stemmer.stem(token) for token in sentence.split()]
    return " ".join(stems).strip()


def clean_all(df):
    """Run the full text-cleaning pipeline on ``df['Text']``.

    The lower-cased text is written to a new ``'Clean_Text'`` column, which
    is then passed through cleanHtml, cleanPunc, keepAlpha, removeStopWords
    and stemming, in that order. Progress messages are printed along the way.

    *df* is modified in place and also returned.
    """
    print('Processing data... \n')
    df['Clean_Text'] = df['Text'].str.lower()

    # (message to print before the step, or None; cleaning function)
    stages = (
        (None, cleanHtml),
        ('Cleaning punctuations... \n', cleanPunc),
        (None, keepAlpha),
        ('Stemming and removing stop words... \n', removeStopWords),
        (None, stemming),
    )
    for banner, step in stages:
        if banner is not None:
            print(banner)
        df['Clean_Text'] = df['Clean_Text'].apply(step)

    print('Done! \n')
    return df

# Label columns kept from the raw CSV and exported alongside the text.
# The order below is the column order of the saved labels file.
label_cols = [
    # S4 / S5
    'S4_num_areas',
    'S4_num_info_queries',
    'S5_distinct_reqs_related',
    # S6
    'S6_is_formal',
    'S6_is_legal',
    'S6_is_technical',
    'S6_is_aggressive',
    # S7
    'S7_dummy_Data',
    'S7_dummy_Database',
    'S7_dummy_Datum',
    'S7_dummy_Document',
    'S7_dummy_MultipleDocuments',
    # S8
    'S8_dummy_Activities',
    'S8_dummy_Budget',
    'S8_dummy_Evaluation',
    'S8_dummy_ExternalContracts',
    'S8_dummy_InstStruc',
    'S8_dummy_Other',
    'S8_dummy_Regulatory',
    # S9
    'S9_dummy_Academic/Scholarly',
    'S9_dummy_Commercial',
    'S9_dummy_Impossible to say',
    'S9_dummy_Monitoring',
    'S9_dummy_Personal',
    # S10
    'S10_is_clear',
    'S10_is_competency_of_institution',
    'S10_is_public',
    'S10_is_existant',
    # S11
    'S11_dummy_Date',
    'S11_dummy_Document',
    'S11_dummy_Institution',
    'S11_dummy_Organization',
    'S11_dummy_Person',
    'S11_dummy_Place',
]


# TF-IDF vectorizer over unigrams and bigrams produced by tokenize().
# Terms appearing in more than 50% or fewer than 1% of documents are dropped.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    tokenizer=tokenize,
    # A custom tokenizer makes the default token_pattern unused; setting it
    # to None states that explicitly and avoids scikit-learn's warning.
    token_pattern=None,
    encoding='utf-8',
    use_idf=True,
    smooth_idf=True,
    max_features=None,
    norm='l2',
    max_df=0.5,
    min_df=0.01,
    strip_accents='unicode',
    # Must be a real bool: recent scikit-learn releases validate parameter
    # types and no longer accept the int 1 for a boolean option.
    sublinear_tf=True,
)


# --- Script body: load, clean, vectorize and export -------------------------
# All paths are relative to the current working directory.
df_all = pd.read_csv('../data_raw/hc_new.csv')
# Keep only the text and label columns. The explicit .copy() gives
# clean_all() an independent frame to add 'Clean_Text' to, instead of a
# column subset of the raw data (which triggers SettingWithCopyWarning).
df_all = df_all[['Text'] + label_cols].copy()

# clean_all mutates and returns the same frame, so all_clean is df_all.
all_clean = clean_all(df_all)

x_all = all_clean['Clean_Text']
X = vectorizer.fit_transform(x_all)
y_all = all_clean[label_cols]

# Saving the vectorized text as a sparse matrix,
# and the labels and cleaned text as CSV files
save_npz('../data_clean/vectorized_text.npz', X)
y_all.to_csv('../data_clean/labels.csv')
x_all.to_csv('../data_clean/text.csv')
